# Commands to start an interactive session on the JHPCE cluster
qrsh -l mem_free=20G,h_vmem=20G
module load conda_R
cd /fastscratch/myscratch/akuo/alsf-filbin
R
library(here)
dataset = "premrna" # options = "premrna_mrna", "mrna", or "premrna"
First, we create a table with information about where each file (quantified counts from salmon) is. This will be used to create a SummarizedExperiment object.
# list tumor names
tumor_names <- list.files(here("sample_data"))[
!grepl("*.txt", list.files(here("sample_data")))]
unique_sf_paths <- NULL
for(tum in tumor_names){
ids <- list.files(here("sample_data", tum))
ids <- unique(stringr::str_sub(ids, end=-13))
ids <- here("salmon_quants", paste0(ids, "_quant"), "quant.sf")
unique_sf_paths <- c(unique_sf_paths, ids)
}
unique_sf_ids <- NULL
for(tum in tumor_names){
ids <- list.files(here("sample_data", tum))
ids <- unique(stringr::str_sub(ids, end=-13))
unique_sf_ids <- c(unique_sf_ids, ids)
}
coldata <- data.frame(files = unique_sf_paths, names = unique_sf_ids)
We also need to use the linkedTxome object to use tximeta properly, i.e. rowRanges(se) won’t be NULL and tximeta will be able to match the transcripts to the genes for us. Note: still doesn’t work
suppressPackageStartupMessages({
library(tximeta)
library(BiocFileCache)
})
# check if linkedTxome is already in the cache
bfcloc <- getTximetaBFC()
bfc <- BiocFileCache(bfcloc)
bfcinfo(bfc)
# if not, load linkedTxome json file
json_file <- here("salmon_files", "gencode.v32_salmon-index-v1.0.0.json")
loadLinkedTxome(json_file)
The only way I’ve figured out how to make this work for now is with skipMeta = TRUE.
se_file_name = here("salmon_quants", paste0("se_", dataset, ".rds"))
# coldata = coldata[1:2, ] # for testing
if(!file.exists(se_file_name)){
# Create SummarizedExperiment object
se <- tximeta(coldata, skipMeta = TRUE) # Takes a few minutes, file size = 597 MB
# se <- tximeta(coldata) # run this line if you used mRNA transcripts in your index only, it will automatically detect the right transcriptome
# se <- tximeta(coldata, ignoreAfterBar = TRUE)
saveRDS(se, se_file_name)
} else {
se = readRDS(se_file_name) # Takes a couple of seconds
}
suppressPackageStartupMessages({
library(SummarizedExperiment)
library(DESeq2)
})
colData(se)
assayNames(se)
rowRanges(se)
dat = assay(se, "counts")
A SingleCellExperiment class is derived from the SummarizedExperiment class. The most important change is the addition of a new slot called reducedDims. Read more here.
# BiocManager::install('SingleCellExperiment')
library(SingleCellExperiment)
sce = SingleCellExperiment(assays = list(counts = dat))
# you can access counts by assay(sce, "counts") or counts(sce)
# you can add a new entry to assays slot by assay(sce, "counts_new") = dat_new
There are a couple of QC metrics to identify low-quality cells:
low_lib_size or (b) few expressed endogeneous genes (nonzero counts for those genes) low_n_featuresI will only do (1) for now.
library(scater)
# Compute quality control metrics:
# sum is the total count for each cell
# detected contains the number of detected genes (actually transcripts for our data)
df = perCellQCMetrics(sce)
df
# Find outliers with low library sizes (LibSize) and few detected features (n_features)
reasons = quickPerCellQC(df) # DataFrame of logical values
colSums(as.matrix(reasons))
# Discard outliers
filtered = sce[, !reasons$discard]
dat_filtered = counts(filtered) # 226608 x 518 for mrna, 221988 x 546 for premrn
sce_filtered = SingleCellExperiment(assays = list(counts = dat_filtered))
filtered_file_name = here("salmon_quants", paste0("sce_filtered_", dataset, ".rds"))
saveRDS(sce_filtered, filtered_file_name)
Diagnostic plots: https://osca.bioconductor.org/quality-control.html#quality-control-plots
library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
## Warning: package 'SummarizedExperiment' was built under R version 3.6.2
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## Warning: package 'DelayedArray' was built under R version 3.6.2
## Warning: package 'BiocParallel' was built under R version 3.6.2
dataset = "mrna"
Run compute-mle.sh to get mle sig (“shape”) parameter first. There will be a parameter for every cell. It will take about 30 minutes to an hour.
# Load filtered counts
filtered_file_name = here("salmon_quants", paste0("sce_filtered_", dataset, ".rds"))
if(file.exists(filtered_file_name)){
sce_filtered = readRDS(filtered_file_name)
dat_filtered = counts(sce_filtered)
}
# Read in mle parameters
mle_file_names = list.files(here("mle_results"), full.names = TRUE)
mle_results = lapply(mle_file_names, readRDS)
sig_vec = sapply(mle_results, function(r) r["sig"])
mu_vec = sapply(mle_results, function(r) r["mu"])
# Plot sig ("shape") distribution
ggplot(tibble(sig = sig_vec), aes(x = sig)) +
geom_histogram(bins = 30) +
labs(title = "Distribution of sig parameter",
x = "sig",
y = "Number of cells")
getMode <- function(x){
keys <- unique(x)
keys[which.max(tabulate(match(x, keys)))]
}
source(here("scripts", "quminorm.R"))
# Convert to pseudo-UMIs
umi_file_name = here("salmon_quants", paste0("sce_umi_", dataset, ".rds"))
if(file.exists(umi_file_name)){
dat_umi = readRDS(umi_file_name)
} else {
dat_umi = quminorm_matrix(dat_filtered, shape = getMode(sig_vec)) # Takes several minutes (20~30 min). In the paper, they use the mode.
dat_umi = dat_umi[!(rowSums(dat_umi) == 0), ] # Remove transcripts with all zeros
sce_umi = SingleCellExperiment(assays = list(counts = dat_umi))
saveRDS(sce_umi, umi_file_name)
}
umi_file_name = here("salmon_quants", paste0("sce_umi_", dataset, ".rds"))
sce_umi = readRDS(umi_file_name)
dat_umi = counts(sce_umi)
# Counts by transcript
dim(dat_umi) # 176307 x 518 for mRNA, 173927 x 546 for premRNA
if(dataset == "mrna"){
se_mrna = readRDS(here("salmon_quants", paste0("se_mrna.rds")))
transcripts_mrna = rownames(assay(se_mrna, "counts"))
row_ranges = rowRanges(se)
genes = unlist(elementMetadata(row_ranges)$gene_id)
mrna_gene_matching_dt = tibble(transcript = transcripts_mrna, genes) %>%
filter(transcript %in% rownames(dat_umi))
dat_gene = rowsum(dat_umi, group = mrna_gene_matching_dt$genes, reorder = FALSE)
dim(dat_gene) # 48890 x 518 for mRNA
} else if(dataset == "premrna") {
# Read in mrna data to look up genes
se_mrna = readRDS(here("salmon_quants", paste0("se_mrna.rds")))
transcripts_mrna = rownames(assay(se_mrna, "counts"))
row_ranges = rowRanges(se_mrna)
genes = unlist(elementMetadata(row_ranges)$gene_id)
mrna_gene_matching_dt = tibble(transcript = transcripts_mrna, genes)
# Get genes for every premrna transcript
# Note: 208 premRNA transcripts did not have corresponding mRNA transcripts
transcripts_premrna = rownames(dat_umi)
transcripts_premrna_converted = gsub(".{8}$", "" , transcripts_premrna)
premrna_gene_matching_dt = left_join(tibble(transcript = transcripts_premrna_converted),
mrna_gene_matching_dt, by = "transcript")
dat_gene = rowsum(dat_umi, group = premrna_gene_matching_dt$genes, reorder = FALSE)
# Remove counts for unmatched transcripts
dat_gene = dat_gene[!is.na(rownames(dat_gene)), ]
dim(dat_gene) # 50476 x 546
}
gene_file_name = here("salmon_quants", paste0("sce_gene_", dataset, ".rds"))
sce_gene = SingleCellExperiment(assays = list(counts = dat_gene))
saveRDS(sce_gene, gene_file_name)
library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
sce_prem = readRDS(here("salmon_quants", "sce_gene_premrna.rds"))
sce_mrna = readRDS(here("salmon_quants", "sce_gene_mrna.rds"))
dat_prem = counts(sce_prem)
dat_mrna = counts(sce_mrna)
dat_prem_rowmeans = tibble(genes = rownames(dat_prem),
mean_prem = rowMeans(dat_prem))
dat_mrna_rowmeans = tibble(genes = rownames(dat_mrna),
mean_mrna = rowMeans(dat_mrna))
dat_rowmeans = full_join(dat_prem_rowmeans, dat_mrna_rowmeans, by = "genes")
dat_rowmeans %>%
ggplot(aes(x = log(mean_mrna),
y = log(mean_prem))) +
geom_point(alpha = 0.5) +
geom_abline(intercept = 0, slope = 1, color = "red") +
labs(x = "Log(average gene expression with mRNA)",
y = "Log(average gene expression with pre-mRNA)") +
theme_bw()
## Warning: Removed 4924 rows containing missing values (geom_point).
First, I make all the column sums the same by scaling. The count in row \(i\) and column \(j\) is transformed as \(x_{ij} = x_{ij}*\sum_i x_{ij}/median_j(\sum_i x_{ij})\).
summary(colSums(dat_prem))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2101 18112 34582 41200 56534 309985
# Apply transformation
n = round(median(colSums(dat_prem)))
dat_prem = apply(dat_prem, MARGIN = 2, function(x) x*n/sum(x))
summary(colSums(dat_prem))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34582 34582 34582 34582 34582 34582
I plot the mean and variance for every row (transcript). Under a poisson distribution, they should be the same.
plot_meanvar = function(dat_sub){
# estimate lambdas and variances for every transcript
means = rowMeans(dat_sub)
vars = apply(dat_sub, MARGIN = 1, var)
# variance = mean under Poisson distribution
tibble(means, vars) %>%
ggplot(aes(x = log(means), y = log(vars))) +
geom_point(alpha = 0.4) +
geom_abline(intercept = 0, slope = 1)
}
plot_meanvar(dat_prem) +
labs(title = "pre-mRNA")
I first compute the average expression for every row (x-axis) \(\hat{\mu}_i\) and the empirical \(P(X_i=0)\), which is the probability that for a given transcript \(i\), the count is 0.
I then compute what \(P(X_i=0)\) would be under the model assumptions of binomial or poisson, using parameters estimated from the data. In particular, for a \(Binom(n, p_i)\), \(n\) is the median number of total counts of cells and \(p_i\) is the mean proportion of counts that were in gene \(i\).
# Function to plot P(X_i = 0) against average expression level mu_i
plot_prob = function(dat_sub){
n = round(median(colSums(dat_sub)))
means = rowMeans(dat_sub)
vars = apply(dat_sub, MARGIN = 1, var)
# empirical P(X_i = 0)
emp_probs_0 = apply(dat_sub, MARGIN = 1, function(r) sum(r==0)/ncol(dat_sub))
plot_dt = tibble(means, emp_probs_0)
emp_props = rowMeans(dat_sub/colSums(dat_sub))
# Model P(X_i = 0) under Binomial
binom_probs_0 = dbinom(x = 0, size = n, prob = emp_props)
# Model P(X_i = 0) under Poisson
poiss_probs_0 = dpois(x = 0, lambda = n*emp_props)
# Model P(X_i = 0) under Negative Binomial
# Estimate size/dispersion parameter
model = lm(vars ~ 1*means + I(means^2) + 0, tibble(means, vars))
phi = 1/coef(model)["I(means^2)"]
nbinom_probs_0 = dnbinom(x = 0, size = phi, mu = n*emp_props)
# Tibble for plot
plot_lines_dt = tibble(means = means,
binomial = binom_probs_0,
poisson = poiss_probs_0,
nbinomial = nbinom_probs_0) %>%
pivot_longer(-means, names_to = "model", values_to = "probs_0")
# Plot
plt = plot_lines_dt %>%
ggplot(aes(x = log(means), y = probs_0)) +
geom_point(data = plot_dt, aes(x = log(means), y = emp_probs_0), alpha = 0.4) + # Add data points
geom_line(aes(color = model),
size = 1.5) + # Add lines for models
labs(x = "Average expression level log(E(X_i))",
y = "P(X_i = 0)") +
theme(text = element_text(size = 15))
# Return object
out = list(plot = plt,
lines_dt = plot_lines_dt)
return(out)
}
# Plot P(X_i = 0) against average expression level
prob_out_prem = plot_prob(dat_prem)
prob_out_prem$plot +
labs(title = "pre-mRNA")
summary(colSums(dat_mrna))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1781 19750 42298 52722 71039 481647
# Apply transformation
n = round(median(colSums(dat_mrna)))
dat_mrna = apply(dat_mrna, MARGIN = 2, function(x) x*n/sum(x))
summary(colSums(dat_mrna))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 42298 42298 42298 42298 42298 42298
# Plot mean against variance
plot_meanvar(dat_mrna) +
labs(title = "mRNA")
# Plot P(X_i = 0) against average expression level
prob_out_mrna = plot_prob(dat_mrna)
prob_out_mrna$plot +
labs(title = "mRNA")
library(scran)
## Warning: package 'scran' was built under R version 3.6.2
library(BiocSingular)
## Warning: package 'BiocSingular' was built under R version 3.6.2
# library(factoextra)
library(tictoc)
library(glmpca)
sce = sce_prem
dat = dat_prem
# Cells in each cluster are normalized separately
clust = quickCluster(dat)
table(clust)
## clust
## 1 2 3 4
## 178 140 109 119
deconv = computeSumFactors(dat, cluster = clust)
## Warning: use 'calculateSumFactors' for any 'x' that is not a
## SingleCellExperiment
X = dat %>%
as.matrix() %>%
sweep(., 2, deconv, "/") %>% # Divide by library size
t()
X = log2(X + 1) # Apply log transformation
# Remove columns (transcripts) with zero variance
nonzero_var_cols = which(apply(X, 2, var) != 0)
X = X[, nonzero_var_cols]
# Run approximate PCA (8 seconds)
tic("approx PCA")
pc = runPCA(X, rank = 5, BSPARAM = IrlbaParam())
toc()
## approx PCA: 1.686 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(X, rank = 5, BSPARAM = RandomParam())
# toc()
# Run PCA with prcomp (~10 minutes)
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))
# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
ggplot(aes(PC1, PC2, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC1, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC2, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
ggplot(aes(PC4, PC5, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
sce = sce_mrna
dat = dat_mrna
# Cells in each cluster are normalized separately
clust = quickCluster(dat)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
table(clust)
## clust
## 1 2 3
## 176 169 173
deconv = computeSumFactors(dat, cluster = clust)
## Warning: use 'calculateSumFactors' for any 'x' that is not a
## SingleCellExperiment
X = dat %>%
as.matrix() %>%
sweep(., 2, deconv, "/") %>% # Divide by library size
t()
X = log2(X + 1) # Apply log transformation
# Remove columns (transcripts) with zero variance
nonzero_var_cols = which(apply(X, 2, var) != 0)
X = X[, nonzero_var_cols]
# Run approximate PCA (8 seconds)
tic("approx PCA")
pc = runPCA(X, rank = 5, BSPARAM = IrlbaParam())
toc()
## approx PCA: 1.404 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(X, rank = 5, BSPARAM = RandomParam())
# toc()
# Run PCA with prcomp (~10 minutes)
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))
# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
ggplot(aes(PC1, PC2, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC1, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC2, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
ggplot(aes(PC4, PC5, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
# Takes a long time, may want to try on cluster
tic("glm PCA")
glmpca(dat %>% as.matrix(), L = 5, fam = c("poi"))
toc()